{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/catboost/tutorials/blob/master/model_analysis/feature_statistics_tutorial.ipynb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "import catboost\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.datasets import make_regression\n",
    "%pylab inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example on generated data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "n_features = 3\n",
    "X, y = make_regression(n_samples=1000, n_features=10, n_informative=n_features, random_state=0)\n",
    "plt.scatter(X[:, 0], X[:, 1], c=y)\n",
    "X = pd.DataFrame(X)\n",
    "X.columns = ['Column_{}'.format(i) for i in range(X.shape[1])]\n",
    "\n",
    "cat_values_1 = ['A', 'B', 'C']\n",
    "cat_values_2 = ['some', 'random', 'categorical', 'feature', 'values', 'testing']\n",
    "X.loc[:, 'CatColumn_1'] = [cat_values_1[np.random.randint(0, len(cat_values_1))] for _ in range(X.shape[0])]\n",
    "X.loc[:, 'CatColumn_2'] = [cat_values_2[np.random.randint(0, len(cat_values_2))] for _ in range(X.shape[0])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X.sample(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train model and plot statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "model = catboost.CatBoostRegressor(cat_features=['CatColumn_1', 'CatColumn_2'], \n",
    "                                   one_hot_max_size=300, iterations=500)\n",
    "model.fit(X, y, silent=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Float feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_num = 'Column_3'\n",
    "res = model.calc_feature_statistics(X, y, feature_num, plot=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### One-Hot feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_num = 'CatColumn_2'\n",
    "res = model.calc_feature_statistics(X, y, feature_num, cat_feature_values=cat_values_2, plot=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test on Titanic dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost.datasets import titanic\n",
    "\n",
    "titanic_train, titanic_test = titanic()\n",
    "titanic_train_target = titanic_train.Survived\n",
    "titanic_train.drop(['PassengerId', 'Survived', 'Name', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)\n",
    "titanic_train.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "titanic_model = catboost.CatBoostClassifier(\n",
    "    iterations=200, \n",
    "    cat_features=['Pclass', 'Sex', 'SibSp'], \n",
    "    one_hot_max_size=10)\n",
    "titanic_model.fit(titanic_train, titanic_train_target, silent=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "titanic_train.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Float feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature = 'Fare'\n",
    "res = titanic_model.calc_feature_statistics(titanic_train, titanic_train_target, feature, plot=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### One-hot feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature = 'Sex'\n",
    "res = titanic_model.calc_feature_statistics(titanic_train, titanic_train_target, feature, plot=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
